# April 8th, 2024
# Damien Babet, Olivier Godechot, Marco Palladino
# Estimations for the paper "In the land of AKM" from Damien Babet, Olivier Godechot and Marco Guido Palladino 
# Version for the "revise and resubmit" of JOLE
# Main changes include :
# - "Year Fixed Effects and AGE-40" specification (from Card et al. 2016)
# - More analysis of the role of occupations
# - Historical series include estimates on the original AKM data

# Estimations use custom functions defined in AKMland_functions.R. You need this
# script in the same working directory as this one:

source("AKMland_functions.R")

# Estimation results are stored in the "saverep" directory. They are then used
# to generate figures (tables and graphs) in the paper, either from the AKMland_figures.R
# script, or manually, as well as additional analysis (regressions of FE, etc.)

saverep <- "./saverep/"


# Data files are generated with SAS scripts at the SAS format, from the Insee servers
# versions. They are stored in the following directory
sasrep <- "./data/sasrep/"

# Files are then translated to the R fst format, with some data preparation. fst files are saved in:
fstrep <- "./data/fstrep/"

# Narrow panel fst file
fstpanel <- "./data/fstpanel/"

# Files are big. This script attempts to minimize RAM usage, at the price of more loading time

#######################
# Estimation settings:#
#######################
# Selection
# Duree min : 90 days / year
# nbheur min : 100 hours / year
# min hourly wage : 80% of hourly smic
# max hourly wage: 1000 times hourly smic
# firm ident, sex, year, age and wage not empty or undefined
# Domempl = 9 ou 8

duree_min <- 360 #included

age_min <- 16 #included
age_max <- 70 #included

size_min <- 1 #minimal number of observations per firm per panel

period_start <- 2002
period_stop <- 2019
panel_length <- 6
panel_starts <- c(2002,2008,2014)

# selection : private firms, including associations (CATJUR=9)
assoc <- TRUE

# baseline estimation: firm-splitting balanced by movers
################################
# FST translation and dataprep #
################################
# Do it only once !
#for (year in period_start:period_stop){
#  fst_translate(year)
#}

####################
# Main estimations #
####################
# 3 periods, comparisons of AKM and two split sampling bias corrections
Mestim <- rotestim(period_length=panel_length,
                    start_year_list=c(2002,2008,2014), 
                    saverep = saverep,
                    method_list=c("AKM","fsplit","psplit"),
                    rotat_year=FALSE,
                    year_method="yearFE",
                    statdes=TRUE,#FALSE,
                    saveopt=TRUE,
                    agemin = age_min,
                    agemax = age_max,
                    dureemin = duree_min,
                    sizemin = size_min,
                    assoc = assoc,
                    save_estim = TRUE)#FALSE)

###################################
# Baseline estimates result files #
###################################
# Each specification (AKM, firm split, etc.) generates its own result files.
# We assemble firm_split and no-plit results in one set of files :
for (period in c("20022007", "20082013", "20142019")){
  dsplit <- read_parquet(paste(saverep, period, "Card_fsplit", ".parquet", sep=""))
  dnosplit <- read_parquet(paste(saverep, period, "Card_nosplit", ".parquet", sep=""))
  
  splitvar <- setdiff(names(dsplit), names(dnosplit))
  d <- cbind(dnosplit, dsplit[,..splitvar])
  write_parquet(
    x = d,
    sink = paste(saverep, period, "_two_methods.parquet", sep=""))
}

baseline_files <- c("20022007_two_methods",
                    "20082013_two_methods",
                    "20142019_two_methods")

#############################################################
# Descriptive statistics for whole panel and connected comp #
#############################################################
for (f in baseline_files){
  d <- read_parquet(paste(saverep, f, ".parquet", sep=""))
  all_panel_stat_des <- detailed_statdes(d)
  
  AKM_comp_stat_des <- detailed_statdes(d[!(is.na(ffe_all)),])
  
  split_comps_stat_des <- detailed_statdes(d[((!is.na(ffe1))&(!is.na(ffe0))),])
  
  compstat <- merge(merge(all_panel_stat_des, AKM_comp_stat_des, by = c("variable","value")), 
                    split_comps_stat_des, by = c("variable","value"))
   
  names(compstat) <- c("variable","value","Panel","AKM_comp","Split_comps")
  
  write.csv(compstat, file = paste(saverep, "stat_des_comparison",f,".csv", sep=""))
  rm(d)
  gc()
  }


###########################################################
# Decomposition split firm with estimated varWFE and varu #
###########################################################
for (f in baseline_files){
  d <- read_parquet(paste(saverep, f, ".parquet", sep=""))
  decomp_f <- split_fast_decompo(d,y_name="lnhwage", method="Card", 
                              split_method="by_firm_movers", approx_u=TRUE)
  write.csv(decomp_f, file = paste(saverep, "decompo_with_approx_",f,".csv", sep=""))
}

rm(d)
gc()


##################################################
# FE regressions on occupation, sex and industry #
##################################################

d <- read_parquet(paste(saverep, "20022007Card_fsplit.parquet", sep=""))
d <- data.table(d)
d <- cleanestim(d, split_correction = TRUE)
fereg_period(d, yearref="02", savename="20022007Card_fsplit")

d <- read_parquet(paste(saverep, "20142019Card_fsplit.parquet", sep=""))
d <- data.table(d)
d <- cleanestim(d, split_correction = TRUE)
fereg_period(d, yearref="14", savename="20142019Card_fsplit")

rm(d)
gc()

###############################
# Occupation FE decomposition #
###############################
for (f in baseline_files){
  d <- read_parquet(paste(saverep, f, ".parquet", sep=""))
  d <- data.table(d)
  d <- cleanestim(d, split_correction = TRUE)
  d <- occ_FE(d, saveref=f)
  occstat <- keepoccstat(d)
  write.csv(occstat, file = paste(saverep, "occstat_",f,".csv", sep=""))
  dec <- occ_decomp(d)
  write.csv(dec, file = paste(saverep, "occdecomp_",f,".csv", sep=""))
  rm(d)
  gc()
}

##################
# Wage quantiles #
##################
q <- period_quantile(2002, 2019)

save(q, file = paste(saverep, "lnhwage_quantiles02_19.RData", sep=""))
write.csv(q, file = paste(saverep, "lnhwage_quantiles02_19.csv", sep=""))





####################
# ANNEXES ANALYSIS #
####################
################################
# Firms with 20+ obs per panel #
################################
pestim20 <- rotestim(period_length=panel_length,
                      start_year_list=panel_starts, 
                      saverep = saverep,
                      method_list=c("AKM"),
                      rotat_year=FALSE,
                      year_method="yearFE",
                      statdes=TRUE,
                      saveopt=TRUE,
                      agemin = age_min,
                      agemax = age_max,
                      dureemin = duree_min,
                      sizemin = 20,
                      assoc = assoc,
                      savenametag="_20+_")
########################################
# Establishment with 20+ fixed effects #
########################################
pestimestab20 <- rotestim(period_length=panel_length,
                         start_year_list=panel_starts, 
                         saverep = saverep,
                         method_list=c("AKM"),
                         estab = TRUE,
                         rotat_year=FALSE,
                         year_method="yearFE",
                         statdes=TRUE,
                         saveopt=TRUE,
                         agemin = age_min,
                         agemax = age_max,
                         dureemin = duree_min,
                         sizemin = 20,
                         assoc = assoc,
                         savenametag="_establishment20+_")
#Choosing the firm split correction as a baseline
#################################################
# Firms with 20+ obs per panel split correction #
#################################################
fpestim20 <- rotestim(period_length=panel_length,
                      start_year_list=panel_starts, 
                      saverep = saverep,
                      method_list=c("fsplit"),
                      rotat_year=FALSE,
                      year_method="yearFE",
                      statdes=TRUE,
                      saveopt=TRUE,
                      agemin = age_min,
                      agemax = age_max,
                      dureemin = duree_min,
                      sizemin = 20,
                      assoc = assoc,
                      savenametag="_20+_")
################################################
# Establishment fixed effects split correction #
################################################
fpestimestab <- rotestim(period_length=panel_length,
                      start_year_list=panel_starts, 
                      saverep = saverep,
                      method_list=c("fsplit"),
                      estab = TRUE,
                      rotat_year=FALSE,
                      year_method="yearFE",
                      statdes=TRUE,
                      saveopt=TRUE,
                      agemin = age_min,
                      agemax = age_max,
                      dureemin = duree_min,
                      sizemin = size_min,
                      assoc = assoc,
                      savenametag="_establishment_")
#############################################################
# back to original specification : lnwage centered per year #
#############################################################
fpestimYFE <- rotestim(period_length=panel_length,
                      start_year_list=panel_starts, 
                      saverep = saverep,
                      method_list=c("fsplit"),
                      rotat_year=FALSE,
                      year_method="yearlymean",
                      statdes=FALSE,
                      saveopt=TRUE,
                      agemin = age_min,
                      agemax = age_max,
                      dureemin = duree_min,
                      sizemin = size_min,
                      assoc = assoc)

##########################
# DUREE selection at 90+ #
##########################
fpestim90 <- rotestim(period_length=panel_length,
                       start_year_list=panel_starts, 
                       saverep = saverep,
                       method_list=c("fsplit"),
                       rotat_year=FALSE,
                       year_method="yearFE",
                       statdes=TRUE,
                       saveopt=TRUE,
                       agemin = age_min,
                       agemax = age_max,
                       dureemin = 90,
                       sizemin = size_min,
                       assoc = assoc,
                       savenametag="_90days_")


#########################
# BLM firm-clusters AKM #
#########################
clestim <- rotestim(period_length=panel_length,
                    start_year_list=panel_starts, 
                    saverep = saverep,
                    method_list=c("AKM"),
                    cluster = TRUE,
                    rotat_year=FALSE,
                    year_method="yearFE",
                    statdes=FALSE,
                    saveopt=TRUE,
                    agemin = age_min,
                    agemax = age_max,
                    dureemin = duree_min,
                    sizemin = 1,
                    assoc = assoc,
                    savenametag="_clusters_")



##################
# Rolling panels #
##################
# rolling 6-years panels to get a detailed historical profile
rot6estimAKM <- rotestim(period_length=panel_length,
                      start_year_list=2002:2014, 
                      saverep = saverep,
                      method_list=c("AKM"),
                      rotat_year=TRUE,
                      year_method="yearFE",
                      statdes=FALSE,
                      saveopt=TRUE,
                      agemin = age_min,
                      agemax = age_max,
                      dureemin = duree_min,
                      sizemin = size_min,
                      assoc = assoc,
                      savenametag="_rollingAKM_")

rot6estim <- rotestim(period_length=panel_length,
                      start_year_list=2002:2014, 
                      saverep = saverep,
                      method_list=c("fsplit"),
                      rotat_year=TRUE,
                      year_method="yearlymean",
                      statdes=FALSE,
                      saveopt=TRUE,
                      agemin = age_min,
                      agemax = age_max,
                      dureemin = duree_min,
                      sizemin = size_min,
                      assoc = assoc)

# rolling 6-years panels with year fixed effects
rot6estimYFE <- rotestim(period_length=panel_length,
                      start_year_list=2002:2014, 
                      saverep = saverep,
                      method_list=c("fsplit"),
                      rotat_year=TRUE,
                      year_method="yearFE",
                      statdes=FALSE,
                      saveopt=TRUE,
                      agemin = age_min,
                      agemax = age_max,
                      dureemin = duree_min,
                      sizemin = size_min,
                      assoc = assoc)

# One 18-years panels with year fixed effects
OnestimYFE <- rotestim(period_length=18,
                         start_year_list=c(2002), 
                         saverep = saverep,
                         method_list=c("fsplit","psplit"),
                         rotat_year=TRUE,
                         year_method="yearFE",
                         statdes=TRUE,
                         saveopt=TRUE,
                         agemin = age_min,
                         agemax = age_max,
                         dureemin = duree_min,
                         sizemin = size_min,
                         assoc = assoc)

############################################################
# Multi random splits for estimation of error due to split #
############################################################

multi02 <- multisplit(year=2002, iter=4,
                      year_method="yearFE",)
gc()

multi14 <- multisplit(year=2014, iter=4,
                      year_method="yearFE",)
gc()

#####################################
# Historical series on narrow panel #
#####################################
#The historical series data is confidential and not available in this package
#There is no simulated data for the historical series
#For this reason, this part of the code is commented out

#Translate sas file into .fst. Do it only once!
#tic("load Panel from SAS file")
#paneldt <- as.data.table(read_sas("./data/sasrep/labor_market_data_i.sas7bdat"))
#toc()

#panel <- hist_dataprep(paneldt)

#write.fst(panel, paste(fstpanel, "narrow_panel.fst", sep=""))

#tic("load panel .fst file")
#panel <- as.data.table(read.fst(paste(fstpanel, "narrow_panel.fst", sep="")))
#toc()

# Hourly wage series: gross hourly wage
#hist_rotmulti(iter=20,
#              yearlist=c(1996:2014),
#              p=panel_length,
#              wage="hourly",
#              agemin=age_min, 
#              agemax=age_max,
#              dureemin=90,
#              sizemin=1)

# Daily wage series: gross daily wage
# hist_rotmulti(iter=20,
#               yearlist=c(1976:2014),
#               p=panel_length,
#               wage="daily",
#               agemin=age_min, 
#               agemax=age_max,
#               dureemin=90,
#               sizemin=1)

# Hourly wage series: net hourly wage
# hist_rotmulti(iter=20,
#               yearlist=c(1996:2014),
#               p=panel_length,
#               wage="hourlynet",
#               agemin=age_min, 
#               agemax=age_max,
#               dureemin=90,
#               sizemin=1)

# Daily wage series: net daily wage
# hist_rotmulti(iter=20,
#               yearlist=c(1976:2014),
#               p=panel_length,
#               wage="dailynet",
#               agemin=age_min, 
#               agemax=age_max,
#               dureemin=90,
#               sizemin=1)

# Historic uncorrected AKM

# Hourly wage series: gross hourly wage
# hist_rotmulti(iter=1,
#               yearlist=c(1996:2014),
#               p=panel_length,
#               wage="hourly",
#               metho= "AKM",
#               agemin=age_min, 
#               agemax=age_max,
#               dureemin=90,
#               sizemin=1)

# Daily wage series: gross daily wage
# hist_rotmulti(iter=1,
#               yearlist=c(1976:2014),
#               p=panel_length,
#               wage="daily",
#               metho= "AKM",
#               agemin=age_min, 
#               agemax=age_max,
#               dureemin=90,
#               sizemin=1)

# Hourly wage series: net hourly wage
# hist_rotmulti(iter=1,
#               yearlist=c(1996:2014),
#               p=panel_length,
#               wage="hourlynet",
#               metho= "AKM",
#               agemin=age_min, 
#               agemax=age_max,
#               dureemin=90,
#               sizemin=1)

# Daily wage series: net daily wage
# hist_rotmulti(iter=1,
#               yearlist=c(1976:2014),
#               p=panel_length,
#               wage="dailynet",
#               metho= "AKM",
#               agemin=age_min, 
#               agemax=age_max,
#               dureemin=90,
#               sizemin=1)


###############
# Simulations #
###############
res <- simcomp(startyear=2002)
gc()

resCard <- simcomp(startyear=2002, year_method = "yearFE")
gc()

#####################################
# Firmsize categories decomposition #
#####################################

size_res <- size_decomp(startyearlist=c(2002,2014), 
                        p=panel_length, 
                        estab=FALSE, 
                        year_method="yearFE", 
                        agemin= age_min, 
                        agemax=age_max,
                        dureemin=duree_min, 
                        sizemin=size_min, 
                        assoc=assoc)
gc()

# Movers per size category per sample

for (f in baseline_files){
  d <- data.table(read_parquet(paste(saverep, f, ".parquet", sep="")))
  c <- centrality_stat(d)
  write.csv(c, file = paste(saverep, f, "centrality_stat.csv", sep=""))
  rm(d)
  gc()
}


for (f in baseline_files){
  d <- data.table(read_parquet(paste(saverep, f, ".parquet", sep="")))
  d[, firmsize_cat:=cut(firmsize, breaks=c(0,20,200,1000,Inf))]
  # n. of unique firms by individual
  d[, nbfirms := (length(unique(firm_id))), by = indiv_id]
  d[, mover := nbfirms>1]
  # To compute the number of connections to other firms each firm has
  d[, nbconnect := sum(nbfirms-1), by=firm_id]
  sizeconnect <- d[,.('nobs'=.N, 
                      'nmovers'=sum(mover),
                      'nconnect'=mean(nbconnect)), by=firmsize_cat]
  sizeconnect$sample <- "All sample"
  sizeconnectAKM <- d[!is.na(ffe_all),
                      .('nobs'=.N,
                        'nmovers'=sum(mover),
                        'nconnect'=sum(nbconnect)), by=firmsize_cat]
  sizeconnectAKM$sample <- "AKM sample"
  sizeconnectsplit <- d[((!is.na(ffe1))&(!is.na(ffe0))),
                        .('nobs'=.N,
                          'nmovers'=sum(mover),
                          'nconnect'=sum(nbconnect)), by=firmsize_cat]
  sizeconnectsplit$sample <- "Split sample"
  
  sizeconnect <- rbind(sizeconnect, sizeconnectAKM, sizeconnectsplit)

  write.csv(sizeconnect, file = paste(saverep, f, "movers_per_firmsize_cat.csv", sep=""))
  }
